Environment Setting

suppressMessages({
  library(data.table)
  library(dplyr)
  library(plotly)
  library(abn)
})

Data Acquisition

var_path <- "data/20230311_tips.csv"

data <- var_path %>% fread()

Pre-proccess

data <- data %>% mutate(
  sex = sex %>% factor(., c("Female", "Male")),
  smoker = smoker %>% factor(., c("No", "Yes")),
  day = day %>% factor(., c("Thur", "Fri", "Sat", "Sun")),
  time = time %>% factor(., c("Lunch", "Dinner"))
)

Check Data

data %>% skimr::skim()
Data summary
Name Piped data
Number of rows 244
Number of columns 7
Key NULL
_______________________
Column type frequency:
factor 4
numeric 3
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
sex 0 1 FALSE 2 Mal: 157, Fem: 87
smoker 0 1 FALSE 2 No: 151, Yes: 93
day 0 1 FALSE 4 Sat: 87, Sun: 76, Thu: 62, Fri: 19
time 0 1 FALSE 2 Din: 176, Lun: 68

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
total_bill 0 1 19.79 8.90 3.07 13.35 17.8 24.13 50.81 ▃▇▃▁▁
tip 0 1 3.00 1.38 1.00 2.00 2.9 3.56 10.00 ▇▆▂▁▁
size 0 1 2.57 0.95 1.00 2.00 2.0 3.00 6.00 ▇▂▂▁▁

Visualization

3D Scatter

tip ~ size + total_bill

data %>%
  plot_ly(x = ~size, y = ~total_bill, z = ~tip, type = "scatter3d", mode = "markers")

tip ~ size + total_bill + smoker

data %>%
  plot_ly(x = ~size, y = ~total_bill, z = ~tip, color = ~smoker, type = "scatter3d", mode = "markers")

Scatter with lm

tip ~ size + total_bill + day +smoker + sex

ggplotly(
  data %>%
    mutate(
      day = day %>% paste("day =", .),
      smoker = smoker %>% paste("smoker =", .)
    ) %>%
    ggplot(aes(x = total_bill, y = tip, color = sex)) +
    geom_smooth(method = "lm", se = F, size = .5, linetype = 2, formula = ~"y ~ x") +
    geom_point(alpha = .8) +
    facet_wrap(smoker ~ day, ncol = 4, scales = "free") +
    scale_x_continuous(labels = scales::label_comma()) +
    scale_y_continuous(labels = scales::label_comma()) +
    colorspace::scale_color_discrete_diverging("Green-Orange") +
    colorspace::scale_fill_discrete_diverging("Green-Orange") +
    ggthemes::theme_fivethirtyeight() +
    theme(legend.position = "bottom")
)

Box plot

Scatter with lm: tip ~ total_bill + day + sex

ggplotly(data %>%
  mutate(
    day = day %>% paste("day =", .),
    smoker = smoker %>% paste("smoker =", .)
  ) %>%
  ggplot(aes(x = sex, y = tip, color = sex)) +
  geom_boxplot() +
  facet_wrap(~day, ncol = 4, scales = "free") +
  scale_y_continuous(labels = scales::label_comma()) +
  colorspace::scale_color_discrete_diverging("Green-Orange") +
  colorspace::scale_fill_discrete_diverging("Green-Orange") +
  ggthemes::theme_fivethirtyeight() +
  theme(legend.position = "bottom"))

Hypothesis testing

Structure Discover

data_abn <- data %>%
  mutate(day = forcats::fct_recode(day, "Weekday" = "Thur", "Weekday" = "Fri", "Weekend" = "Sat", "Weekend" = "Sun")) %>%
  mutate_at(.vars = c("total_bill", "tip"), ~ log(.)) %>%
  data.frame()

data.dists <- list(
  "total_bill" = "gaussian",
  "tip" = "gaussian",
  "sex" = "binomial",
  "smoker" = "binomial",
  "day" = "binomial",
  "time" = "binomial",
  "size" = "gaussian"
)

data_abn %>%
  buildScoreCache(data.df = ., data.dists = data.dists, max.parents = 1) %>%
  mostProbable() %>%
  fitAbn() %>%
  plot()
## Step1. completed max alpha_i(S) for all i and S
## Total sets g(S) to be evaluated over: 128

Regression

list_model <- c("total_bill", "size", "total_bill + size") %>%
  paste("tip ~", .) %>%
  c(., "size ~ total_bill") %>%
  lapply(., as.formula) %>%
  lapply(., function(x) {
    glm(data = data_abn, formula = x)
  })

suppressMessages({
  list_model %>%
    jtools::export_summs(.,
      error_format = "[{conf.low}, {conf.high}]",
      model.names = c("total bill", "size", "both", "size by total bill")
    )
})
total billsizebothsize by total bill
(Intercept)-0.95 ***0.44 ***-0.89 ***-1.12 ***
[-1.22, -0.68]   [0.30, 0.57]   [-1.16, -0.61]   [-1.77, -0.48]   
total_bill0.68 ***       0.60 ***1.28 ***
[0.58, 0.77]          [0.49, 0.72]   [1.06, 1.50]   
size       0.22 ***0.06 *         
       [0.17, 0.27]   [0.00, 0.11]          
N244       244       244       244       
AIC141.35    228.26    138.81    568.73    
BIC151.85    238.75    152.80    579.23    
Pseudo R20.67    0.34    0.68    0.37    
*** p < 0.001; ** p < 0.01; * p < 0.05.

Plot model fitness

total_bill

par(mfrow = c(2, 2))
list_model[[1]]%>%plot()

both

par(mfrow = c(2, 2))
list_model[[3]]%>%plot()